# Canen and Martin (2019)

# This script takes our main dataset, merges it with covariates (from the data on balance tests)
# and drops ads which had an inbalance in age/hispanic (drops those with p-values<=0.05 for 
# H0: no diff in covariate across T/C))

library(data.table)
library(magrittr)
library(tidyverse)
library(stargazer)

### SET WORKING DIRECTORY HERE ###
path_to_archive <- "replication/"
data_dir <- paste0(path_to_archive, "data/")
setwd(data_dir)
plot_dir <- paste0(path_to_archive, "plots/")
tables_dir <- paste0(path_to_archive, "tables/")

### FUNCTION DEFINITIONS FOR WEIGHTING SCHEMES
diff2 <- function(x) x[seq(1,length(x)-1,2)] - x[seq(2,length(x),2)]

inv_var_weight <- function(i,d) {
	weighted.mean(d[i, est], w=sqrt(d[i, n_t] * d[i, n_c]), na.rm=T)
}

total_size_weight <- function(i, d) {
	weighted.mean(d[i,est], w=d[i,n_t] + d[i,n_c], na.rm=T)
}

simple_avg <- function(i,d){
	mean(d[i,est], na.rm=T)
}


# read ad-level pre/post viewing data
# and balance tests at ad level
all_dd_data <- readRDS(paste0(data_dir, "all_dd_data.rds"))
load(paste0(data_dir, "ads_dd_balance_all.RData"))

balance_all %<>% 
  as.data.table %>%
  setnames(old="event_time_utc", new = "ad_time_utc")

# drop ads with p-values<0.05 in hispanic, age balance checks.
hispanic_balanced <- balance_all[variable=="hispanic" & t_eff_p>0.05, .(dma_code, channel, affiliate, program, ad_time_utc)]
age_balanced <- balance_all[variable=="age" & t_eff_p>0.05, .(dma_code, channel, affiliate, program, ad_time_utc)]

dd_hispanic_balanced <- all_dd_data[hispanic_balanced, on = c("dma_code", "channel", "affiliate", "program", "ad_time_utc"), nomatch=0]
dd_age_balanced <- all_dd_data[age_balanced, on = c("dma_code", "channel", "affiliate", "program", "ad_time_utc"), nomatch=0]

# 1) redo main estimates for the hispanic-balanced set
dd_long <- dd_hispanic_balanced %>% 
  gather(key="hour", val="view_sum", dplyr::starts_with("s_pre"), dplyr::starts_with("s_post")) %>%
  as.data.table

# collapse to combined C group
dd_long <- dd_long[,`:=` (groupTC = if_else(group == "T", "T", "C"), n_c = n_c1 + n_c2 + n_c12)] %>%
  .[,.(view_sum = sum(view_sum)), by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, hour, group = groupTC)]


dd_long[, `:=`(view_mean = view_sum / ifelse(group=="C", n_c, n_t),
         pre_post = sub("s_(pre|post)_(\\d+)", "\\1", hour),
         hour = sub("s_(pre|post)_(\\d+)", "\\2", hour) %>% as.numeric)]

dd <- 
  dd_long[, .(news_min = sum(view_mean)/60), by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, group, pre_post)] %>%
  setkey(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, group, pre_post) %>%
  .[, .(est = -diff2(diff2(.SD$news_min))) ,by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c)]


pt_est_hispbalanced <- inv_var_weight(1:nrow(dd),dd)
set.seed(1850159)
bs_hispbalanced <- map_dbl(1:1000, ~ inv_var_weight(sample.int(nrow(dd), replace=T), dd))
results_hispbalanced <- data.table(replicate = 0:1000, estimate = c(pt_est_hispbalanced, bs_hispbalanced))

pt_est_hispbalanced_simple <- simple_avg(1:nrow(dd),dd)
set.seed(1850159)
bs_hispbalanced_simple <- map_dbl(1:1000, ~ simple_avg(sample.int(nrow(dd), replace=T), dd))
results_hispbalanced_simple <- data.table(replicate = 0:1000, estimate = c(pt_est_hispbalanced_simple, bs_hispbalanced_simple))

pt_est_hispbalanced_size <- total_size_weight(1:nrow(dd),dd)
set.seed(1850159)
bs_hispbalanced_size <- map_dbl(1:1000, ~ total_size_weight(sample.int(nrow(dd), replace=T), dd))
results_hispbalanced_size <- data.table(replicate = 0:1000, estimate = c(pt_est_hispbalanced_size, bs_hispbalanced_size))

quantiles <- c(0.01,0.025,0.975,0.99)

ci_hispbalanced <- results_hispbalanced[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]
ci_hispbalanced_simple <- results_hispbalanced_simple[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]
ci_hispbalanced_size <- results_hispbalanced_size[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]

### TABLE C.8
y <- rnorm(100)
x <- matrix(rnorm(100*nrow(ci_hispbalanced)), nrow=100)

fake_model <- lm(y ~ x - 1)

dd_bootstrap_cis_hispbalanced <- stargazer(list(fake_model, fake_model, fake_model),
      title="Average Effects and Bootstrapped Confidence Intervals: Ads balanced on Hispanic share only.",
      style="apsr",
      out=paste0(tables_dir, "dd_bootstrap_cis_hispbalanced.tex"),
      label="tab:dd_boostrap_cis_balanced",
      coef=list(ci_hispbalanced[,point_est], ci_hispbalanced_size[,point_est], ci_hispbalanced_simple[,point_est]), 
      ci=T,
      ci.custom=list(ci_hispbalanced[, .(q_025, q_975)] %>% as.matrix,
               ci_hispbalanced_size[, .(q_025, q_975)] %>% as.matrix,
               ci_hispbalanced_simple[, .(q_025, q_975)] %>% as.matrix),
      star.cutoffs = NA,
      omit.stat = "all",
      column.labels = c("Inv. Variance", "Total Devices", "Equally Weighted"),
      dep.var.labels = "Effect (Minutes)",
      covariate.labels = "Treated",
      notes="\\parbox[t]{0.7\\linewidth}{Table reports average effects, weighted by 1) the geometric mean of treatment and control group size, 2) total devices in treatment and control groups and 3) a simple unweighted average. Confidence intervals are the central 95\\% interval of 1000 bootstrap replicates. The sample is restricted to the set of ads with p-value greater than 0.05 for the hypothesis that treatment and control groups have no difference in the fraction of Hispanic viewers.}",
      notes.append=F
      )


# 2) redo main estimates for the age-balanced set
dd_long <- dd_age_balanced %>% 
  gather(key="hour", val="view_sum", dplyr::starts_with("s_pre"), dplyr::starts_with("s_post")) %>%
  as.data.table

# collapse to combined C group
dd_long <- dd_long[,`:=` (groupTC = if_else(group == "T", "T", "C"), n_c = n_c1 + n_c2 + n_c12)] %>%
  .[,.(view_sum = sum(view_sum)), by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, hour, group = groupTC)]


dd_long[, `:=`(view_mean = view_sum / ifelse(group=="C", n_c, n_t),
         pre_post = sub("s_(pre|post)_(\\d+)", "\\1", hour),
         hour = sub("s_(pre|post)_(\\d+)", "\\2", hour) %>% as.numeric)]

dd <- 
  dd_long[, .(news_min = sum(view_mean)/60), by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, group, pre_post)] %>%
  setkey(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, group, pre_post) %>%
  .[, .(est = -diff2(diff2(.SD$news_min))) ,by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c)]


pt_est_agebalanced <- inv_var_weight(1:nrow(dd),dd)
set.seed(1850159)
bs_agebalanced <- map_dbl(1:1000, ~ inv_var_weight(sample.int(nrow(dd), replace=T), dd))
results_agebalanced <- data.table(replicate = 0:1000, estimate = c(pt_est_agebalanced, bs_agebalanced))

pt_est_agebalanced_simple <- simple_avg(1:nrow(dd),dd)
set.seed(1850159)
bs_agebalanced_simple <- map_dbl(1:1000, ~ simple_avg(sample.int(nrow(dd), replace=T), dd))
results_agebalanced_simple <- data.table(replicate = 0:1000, estimate = c(pt_est_agebalanced_simple, bs_agebalanced_simple))

pt_est_agebalanced_size <- total_size_weight(1:nrow(dd),dd)
set.seed(1850159)
bs_agebalanced_size <- map_dbl(1:1000, ~ total_size_weight(sample.int(nrow(dd), replace=T), dd))
results_agebalanced_size <- data.table(replicate = 0:1000, estimate = c(pt_est_agebalanced_size, bs_agebalanced_size))


quantiles <- c(0.01,0.025,0.975,0.99)

ci_agebalanced <- results_agebalanced[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]
ci_agebalanced_simple <- results_agebalanced_simple[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]
ci_agebalanced_size <- results_agebalanced_size[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]

### TABLE C.9 
y <- rnorm(100)
x <- matrix(rnorm(100*nrow(ci_agebalanced)), nrow=100)

fake_model <- lm(y ~ x - 1)

dd_bootstrap_cis_agebalanced <- stargazer(list(fake_model, fake_model, fake_model),
      title="Average Effects and Bootstrapped Confidence Intervals: Age-balanced ads only.",
      style="apsr",
      out=paste0(tables_dir, "dd_bootstrap_cis_agebalanced.tex"),
      label="tab:dd_boostrap_cis_balanced",
      coef=list(ci_agebalanced[,point_est], ci_agebalanced_size[,point_est], ci_agebalanced_simple[,point_est]), 
      ci=T,
      ci.custom=list(ci_agebalanced[, .(q_025, q_975)] %>% as.matrix,
               ci_agebalanced_size[, .(q_025, q_975)] %>% as.matrix,
               ci_agebalanced_simple[, .(q_025, q_975)] %>% as.matrix),
      star.cutoffs = NA,
      omit.stat = "all",
      column.labels = c("Inv. Variance", "Total Devices", "Equally Weighted"),
      dep.var.labels = "Effect (Minutes)",
      covariate.labels = "Treated",
      notes="\\parbox[t]{0.7\\linewidth}{Table reports average effects, weighted by 1) the geometric mean of treatment and control group size, 2) total devices in treatment and control groups and 3) a simple unweighted average. Confidence intervals are the central 95\\% interval of 1000 bootstrap replicates. The sample is restricted to the set of ads with p-value greater than 0.05 for the hypothesis that treatment and control groups have no difference in mean age.}",
      notes.append=F
      )